import numpy as np
import pandas as pd
from IPython.display import display
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import FunctionTransformer
from scipy.stats import chi2_contingency
def read_data(path):
df = pd.read_csv(path)
print(df.columns)
print()
print("the shape is: ", df.shape)
return df
attendance = read_data('attendance.csv')
attendance.head()
Index(['home_team', 'away_team', 'Attendance', 'Date'], dtype='object') the shape is: (964, 4)
| home_team | away_team | Attendance | Date | |
|---|---|---|---|---|
| 0 | Argentina | France | 88966 | 12/18/2022 |
| 1 | Croatia | Morocco | 44137 | 12/17/2022 |
| 2 | France | Morocco | 68294 | 12/14/2022 |
| 3 | Argentina | Croatia | 88966 | 12/13/2022 |
| 4 | Morocco | Portugal | 44198 | 12/10/2022 |
matches = read_data('worldcup-1.1.0/data-csv/matches.csv')
matches[['stadium_id', 'home_team_name', 'away_team_name', 'match_date']].head()
Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
'stage_name', 'group_name', 'group_stage', 'knockout_stage', 'replayed',
'replay', 'match_date', 'match_time', 'stadium_id', 'stadium_name',
'city_name', 'country_name', 'home_team_id', 'home_team_name',
'home_team_code', 'away_team_id', 'away_team_name', 'away_team_code',
'score', 'home_team_score', 'away_team_score', 'home_team_score_margin',
'away_team_score_margin', 'extra_time', 'penalty_shootout',
'score_penalties', 'home_team_score_penalties',
'away_team_score_penalties', 'result', 'home_team_win', 'away_team_win',
'draw'],
dtype='object')
the shape is: (964, 37)
| stadium_id | home_team_name | away_team_name | match_date | |
|---|---|---|---|---|
| 0 | S-193 | France | Mexico | 7/13/1930 |
| 1 | S-192 | United States | Belgium | 7/13/1930 |
| 2 | S-192 | Yugoslavia | Brazil | 7/14/1930 |
| 3 | S-193 | Romania | Peru | 7/14/1930 |
| 4 | S-192 | Argentina | France | 7/15/1930 |
stadiums = read_data('worldcup-1.1.0/data-csv/stadiums.csv')
stadiums.head()
Index(['key_id', 'stadium_id', 'stadium_name', 'city_name', 'country_name',
'stadium_capacity', 'stadium_wikipedia_link', 'city_wikipedia_link'],
dtype='object')
the shape is: (193, 8)
| key_id | stadium_id | stadium_name | city_name | country_name | stadium_capacity | stadium_wikipedia_link | city_wikipedia_link | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | S-001 | Estadio José Amalfitani | Buenos Aires | Argentina | 49000 | https://en.wikipedia.org/wiki/José_Amalfitani_... | https://en.wikipedia.org/wiki/Buenos_Aires |
| 1 | 2 | S-002 | Estadio Monumental | Buenos Aires | Argentina | 75000 | https://en.wikipedia.org/wiki/Estadio_Monument... | https://en.wikipedia.org/wiki/Buenos_Aires |
| 2 | 3 | S-003 | Estadio Chateau Carreras | Córdoba | Argentina | 47000 | https://en.wikipedia.org/wiki/Estadio_Mario_Al... | https://en.wikipedia.org/wiki/Córdoba,_Argentina |
| 3 | 4 | S-004 | Estadio José María Minella | Mar del Plata | Argentina | 44000 | https://en.wikipedia.org/wiki/Estadio_José_Mar... | https://en.wikipedia.org/wiki/Mar_del_Plata |
| 4 | 5 | S-005 | Estadio Ciudad de Mendoza | Mendoza | Argentina | 35000 | https://en.wikipedia.org/wiki/Estadio_Malvinas... | https://en.wikipedia.org/wiki/Mendoza,_Argentina |
matches = pd.merge(
matches,
stadiums[['stadium_id', 'stadium_capacity']],
on='stadium_id',
how='inner'
)
matches = matches.sort_values('key_id').reset_index(drop=True)
matches.head()
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | extra_time | penalty_shootout | score_penalties | home_team_score_penalties | away_team_score_penalties | result | home_team_win | away_team_win | draw | stadium_capacity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 10000 |
| 1 | 2 | WC-1930 | 1930 FIFA World Cup | M-1930-02 | United States v Belgium | group stage | Group 4 | 1 | 0 | 0 | ... | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 20000 |
| 2 | 3 | WC-1930 | 1930 FIFA World Cup | M-1930-03 | Yugoslavia v Brazil | group stage | Group 2 | 1 | 0 | 0 | ... | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 20000 |
| 3 | 4 | WC-1930 | 1930 FIFA World Cup | M-1930-04 | Romania v Peru | group stage | Group 3 | 1 | 0 | 0 | ... | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 10000 |
| 4 | 5 | WC-1930 | 1930 FIFA World Cup | M-1930-05 | Argentina v France | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | 0 | 0-0 | 0 | 0 | home team win | 1 | 0 | 0 | 20000 |
5 rows × 38 columns
matches.shape
(964, 38)
عند الدمج مع الحضور تم ملاحظة أن عدد الأسطر بعد الدمج لا يساوي عدد الأسطر قبل الدمج
لذلك تم إنشاء تابع يقوم بإيجاد الفرق وفقا للأعمدة المعطاة في كل مرة
def check_differences(matches_lookups, attendance_lookups):
df1 = matches[[lookup for lookup in matches_lookups]]
s1 = {tuple(x) for x in df1.to_numpy()}
df2 = attendance[[lookup for lookup in attendance_lookups]]
s2 = {tuple(x) for x in df2.to_numpy()}
return s1 - s2, s2 - s1
أولاً تفقد أسماء الفرق في الذهاب والإياب
check_differences(['home_team_name'], ['home_team'])
({('China',),
('East Germany',),
('Iran',),
('Ivory Coast',),
('North Korea',),
('South Korea',),
('Turkey',)},
{('China PR',),
("Côte d'Ivoire",),
('FR Yugoslavia',),
('Germany DR',),
('IR Iran',),
('Korea DPR',),
('Korea Republic',),
('Türkiye',)})
check_differences(['away_team_name'], ['away_team'])
({('China',),
('East Germany',),
('Iran',),
('Ivory Coast',),
('North Korea',),
('South Korea',),
('Turkey',)},
{('China PR',),
("Côte d'Ivoire",),
('FR Yugoslavia',),
('Germany DR',),
('IR Iran',),
('Korea DPR',),
('Korea Republic',),
('Türkiye',)})
هناك اختلاف بأسماء بعض الفرق، سنقوم بتوحيد الأسماء في كلا الجدولين
ملاحظة: في كلا الجدولين يوجد بعض المباريات التي فيها West Germany, Germany ولكن لأجل نفس المباراة قد يحوي الجدول الأول Germany بينما الجدول الثاني West Germany
print(
attendance['home_team'][attendance['home_team'] == 'West Germany'].count()
+ attendance['home_team'][attendance['home_team'] == 'Germany'].count()
+ attendance['away_team'][attendance['away_team'] == 'West Germany'].count()
+ attendance['away_team'][attendance['away_team'] == 'Germany'].count()
)
print(
matches['home_team_name'][matches['home_team_name']
== 'West Germany'].count()
+ matches['home_team_name'][matches['home_team_name'] == 'Germany'].count()
+ matches['away_team_name'][matches['away_team_name']
== 'West Germany'].count()
+ matches['away_team_name'][matches['away_team_name'] == 'Germany'].count()
)
112 112
هناك 7 دول لم تعد موجودة، وقامت الفيفا بتحديد من يأخذ سجلات الفريق الذي لم يعد موجودا
سيتم استعراض هذه الدول في القسم الثاني من القاموس التالي
https://www.dailystar.co.uk/sport/football/countries-not-exist-world-cup-28504115
country_map = {
'China PR': 'China',
'Côte d\'Ivoire': 'Ivory Coast',
'IR Iran': 'Iran',
'Türkiye': 'Turkey',
'Germany DR': 'East Germany',
'Korea DPR': 'North Korea',
'Korea Republic': 'South Korea',
# united/separated countries with countries that inherited their records according to FIFA
'Soviet Union': 'Russia',
'West Germany': 'Germany', # Germany only inherited West Germany's record, not East Germany
'FR Yugoslavia': 'Serbia',
'Yugoslavia': 'Serbia',
'Czechoslovakia': 'Czech Republic',
'Dutch East Indies': 'Indonesia',
'Zaire': 'DR Congo'
}
توحيد الأسماء في كلا الجدولين
def unify_teams_names(data, columns):
for column in columns:
data[column] = data[column].apply(lambda x: country_map.get(x, x))
def unify_matches_names(data, columns, delimiter=' v '):
for column in columns:
data[column] = data[column].apply(
lambda x: delimiter.join(country_map.get(x, x) for x in x.split(delimiter))
)
unify_teams_names(attendance, ['home_team', 'away_team'])
unify_teams_names(matches, ['home_team_name', 'away_team_name'])
unify_matches_names(matches, ['home_team_name', 'away_team_name'])
ثانياً تفقد تاريخ المباريات: لا يوجد فروقات
check_differences(['match_date'], ['Date'])
(set(), set())
رغم كل ما سبق، لا زال يوجد اختلافات لذلك سننظر للثلاث أعمدة في نفس الوقت
diff1, diff2 = check_differences(
['home_team_name', 'away_team_name', 'match_date'],
['home_team', 'away_team', 'Date']
)
diff1 = list(diff1)
diff1.sort(key=lambda x: x[2])
diff1
[('Argentina', 'Italy', '6/10/1978'),
('Russia', 'Uruguay', '6/14/1970'),
('Italy', 'Germany', '6/14/1978'),
('Austria', 'Netherlands', '6/14/1978'),
('Peru', 'Poland', '6/18/1978'),
('Netherlands', 'Germany', '6/18/1978'),
('England', 'Switzerland', '6/20/1954'),
('Italy', 'Netherlands', '6/21/1978'),
('Spain', 'Northern Ireland', '6/25/1982'),
('Austria', 'Switzerland', '6/26/1954'),
('Belgium', 'France', '6/28/1986'),
('Brazil', 'Sweden', '6/3/1978'),
('Republic of Ireland', 'Italy', '6/30/1990'),
('Argentina', 'Serbia', '6/30/1990'),
('Czech Republic', 'Germany', '7/1/1990'),
('Cameroon', 'England', '7/1/1990'),
('Chile', 'North Korea', '7/15/1966'),
('Argentina', 'Germany', '7/16/1966'),
('Mexico', 'Uruguay', '7/19/1966'),
('Argentina', 'Italy', '7/3/1990'),
('Russia', 'Poland', '7/4/1982')]
diff2 = list(diff2)
diff2.sort(key=lambda x: x[2])
diff2
[('Italy', 'Argentina', '6/10/1978'),
('Uruguay', 'Russia', '6/14/1970'),
('Germany', 'Italy', '6/14/1978'),
('Netherlands', 'Austria', '6/14/1978'),
('Poland', 'Peru', '6/18/1978'),
('Germany', 'Netherlands', '6/18/1978'),
('Switzerland', 'England', '6/20/1954'),
('Netherlands', 'Italy', '6/21/1978'),
('Northern Ireland', 'Spain', '6/25/1982'),
('Switzerland', 'Austria', '6/26/1954'),
('France', 'Belgium', '6/28/1986'),
('Sweden', 'Brazil', '6/3/1978'),
('Italy', 'Republic of Ireland', '6/30/1990'),
('Serbia', 'Argentina', '6/30/1990'),
('England', 'Cameroon', '7/1/1990'),
('Germany', 'Czech Republic', '7/1/1990'),
('North Korea', 'Chile', '7/15/1966'),
('Germany', 'Argentina', '7/16/1966'),
('Uruguay', 'Mexico', '7/19/1966'),
('Italy', 'Argentina', '7/3/1990'),
('Poland', 'Russia', '7/4/1982')]
نلاحظ أن الفروقات ناتجة عن تبديل فرق الذهاب والإياب لبعض المباريات لذلك سنقوم بعكسهم
swapped_attendance = attendance[
attendance[
['home_team', 'away_team', 'Date']
].apply(tuple, 1).isin(diff2)
]
swapped_attendance.head()
| home_team | away_team | Attendance | Date | |
|---|---|---|---|---|
| 503 | Italy | Argentina | 59978 | 7/3/1990 |
| 504 | Germany | Czech Republic | 73347 | 7/1/1990 |
| 505 | England | Cameroon | 55205 | 7/1/1990 |
| 506 | Serbia | Argentina | 38971 | 6/30/1990 |
| 507 | Italy | Republic of Ireland | 73303 | 6/30/1990 |
attendance.loc[
swapped_attendance.index,
['home_team', 'away_team']
] = attendance.loc[
swapped_attendance.index,
['away_team', 'home_team']
].values
attendance[500:505]
| home_team | away_team | Attendance | Date | |
|---|---|---|---|---|
| 500 | Germany | Argentina | 73603 | 7/8/1990 |
| 501 | Italy | England | 51426 | 7/7/1990 |
| 502 | Germany | England | 62628 | 7/4/1990 |
| 503 | Argentina | Italy | 59978 | 7/3/1990 |
| 504 | Czech Republic | Germany | 73347 | 7/1/1990 |
matches = pd.merge(
matches, attendance,
left_on=['home_team_name', 'away_team_name', 'match_date'],
right_on=['home_team', 'away_team', 'Date'],
how='inner'
)
matches.head()
| key_id | tournament_id | tournament_name | match_id | match_name | stage_name | group_name | group_stage | knockout_stage | replayed | ... | away_team_score_penalties | result | home_team_win | away_team_win | draw | stadium_capacity | home_team | away_team | Attendance | Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | M-1930-01 | France v Mexico | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 10000 | France | Mexico | 4444 | 7/13/1930 |
| 1 | 2 | WC-1930 | 1930 FIFA World Cup | M-1930-02 | United States v Belgium | group stage | Group 4 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 20000 | United States | Belgium | 18346 | 7/13/1930 |
| 2 | 3 | WC-1930 | 1930 FIFA World Cup | M-1930-03 | Yugoslavia v Brazil | group stage | Group 2 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 20000 | Serbia | Brazil | 24059 | 7/14/1930 |
| 3 | 4 | WC-1930 | 1930 FIFA World Cup | M-1930-04 | Romania v Peru | group stage | Group 3 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 10000 | Romania | Peru | 2549 | 7/14/1930 |
| 4 | 5 | WC-1930 | 1930 FIFA World Cup | M-1930-05 | Argentina v France | group stage | Group 1 | 1 | 0 | 0 | ... | 0 | home team win | 1 | 0 | 0 | 20000 | Argentina | France | 23409 | 7/15/1930 |
5 rows × 42 columns
تابع للتفقد من وجود قيم فارغة ضمن عمود، والأخر على مستوى جميع الأعمدة
def check_na(data, column):
num_nan = data[column].isna().sum()
if num_nan:
raise Exception(f"There is {num_nan} nan values in {column}")
def check_all_na(data):
for column in data.columns:
check_na(data, column)
check_all_na(matches)
matches.shape
(964, 42)
shape_before_drop = matches.shape
matches.drop_duplicates(inplace=True)
shape_before_drop == matches.shape
True
players = read_data('worldcup-1.1.0/data-csv/players.csv')
players.head()
Index(['key_id', 'player_id', 'family_name', 'given_name', 'birth_date',
'goal_keeper', 'defender', 'midfielder', 'forward', 'count_tournaments',
'list_tournaments', 'player_wikipedia_link'],
dtype='object')
the shape is: (8485, 12)
| key_id | player_id | family_name | given_name | birth_date | goal_keeper | defender | midfielder | forward | count_tournaments | list_tournaments | player_wikipedia_link | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | P-08891 | A'Court | Alan | 1934-09-30 | 0 | 0 | 0 | 1 | 1 | 1958 | https://en.wikipedia.org/wiki/Alan_A%27Court |
| 1 | 2 | P-08589 | Aaronson | Brenden | 2000-10-22 | 0 | 0 | 0 | 1 | 1 | 2022 | https://en.wikipedia.org/wiki/Brenden_Aaronson |
| 2 | 3 | P-04897 | Abadzhiev | Stefan | 1934-07-03 | 0 | 0 | 0 | 1 | 1 | 1966 | https://en.wikipedia.org/wiki/Stefan_Abadzhiev |
| 3 | 4 | P-05556 | Abalo | Jean-Paul | 1975-06-26 | 0 | 1 | 0 | 0 | 1 | 2006 | https://en.wikipedia.org/wiki/Jean-Paul_Abalo |
| 4 | 5 | P-08163 | Abanda | Patrice | 1978-08-03 | 0 | 1 | 0 | 0 | 1 | 1998 | https://en.wikipedia.org/wiki/Patrice_Abanda |
squads = read_data('worldcup-1.1.0/data-csv/squads.csv')
squads[squads['team_name'] == 'Russia'].head()
Index(['key_id', 'tournament_id', 'tournament_name', 'team_id', 'team_name',
'team_code', 'player_id', 'family_name', 'given_name', 'shirt_number',
'position_name', 'position_code'],
dtype='object')
the shape is: (10973, 12)
| key_id | tournament_id | tournament_name | team_id | team_name | team_code | player_id | family_name | given_name | shirt_number | position_name | position_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5603 | 5604 | WC-1994 | 1994 FIFA World Cup | T-60 | Russia | RUS | P-09869 | Cherchesov | Stanislav | 1 | goal keeper | GK |
| 5604 | 5605 | WC-1994 | 1994 FIFA World Cup | T-60 | Russia | RUS | P-03790 | Kuznetsov | Dmitri | 2 | defender | DF |
| 5605 | 5606 | WC-1994 | 1994 FIFA World Cup | T-60 | Russia | RUS | P-06666 | Gorlukovich | Sergei | 3 | defender | DF |
| 5606 | 5607 | WC-1994 | 1994 FIFA World Cup | T-60 | Russia | RUS | P-04883 | Galiamin | Dmitri | 4 | defender | DF |
| 5607 | 5608 | WC-1994 | 1994 FIFA World Cup | T-60 | Russia | RUS | P-02169 | Nikiforov | Yuri | 5 | defender | DF |
teams_codes = dict(zip(squads['team_name'], squads['team_code']))
teams_codes['Indonesia'] = teams_codes['Dutch East Indies']
teams_codes['DR Congo'] = teams_codes['Zaire']
list(teams_codes.items())[:5]
[('Argentina', 'ARG'),
('Belgium', 'BEL'),
('Bolivia', 'BOL'),
('Brazil', 'BRA'),
('Chile', 'CHL')]
teams_ids = dict(zip(squads['team_name'], squads['team_id']))
teams_ids['Indonesia'] = teams_ids['Dutch East Indies']
teams_ids['DR Congo'] = teams_ids['Zaire']
def unify_according_to_team_name(my_dict):
def unify(data, columns):
for column in columns:
team_name, mapped_attribute = column
data[mapped_attribute] = data[[team_name, mapped_attribute]].apply(
lambda x: my_dict[
country_map.get(x[team_name], x[team_name])
], axis=1
)
return unify
unify_teams_codes = unify_according_to_team_name(teams_codes)
unify_teams_ids = unify_according_to_team_name(teams_ids)
unify_teams_names(squads, ['team_name'])
unify_teams_codes(squads, [('team_name', 'team_code')])
unify_teams_ids(squads, [('team_name', 'team_id')])
squads[squads['team_name'] == 'Russia'].head()
| key_id | tournament_id | tournament_name | team_id | team_name | team_code | player_id | family_name | given_name | shirt_number | position_name | position_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1780 | 1781 | WC-1958 | 1958 FIFA World Cup | T-60 | Russia | RUS | P-07374 | Yashin | Lev | 1 | goal keeper | GK |
| 1781 | 1782 | WC-1958 | 1958 FIFA World Cup | T-60 | Russia | RUS | P-00011 | Kesarev | Vladimir | 2 | defender | DF |
| 1782 | 1783 | WC-1958 | 1958 FIFA World Cup | T-60 | Russia | RUS | P-05169 | Krizhevsky | Konstantin | 3 | midfielder | MF |
| 1783 | 1784 | WC-1958 | 1958 FIFA World Cup | T-60 | Russia | RUS | P-08404 | Kuznetsov | Boris | 4 | defender | DF |
| 1784 | 1785 | WC-1958 | 1958 FIFA World Cup | T-60 | Russia | RUS | P-05220 | Voinov | Yuri | 5 | defender | DF |
unify_teams_codes(matches, [
('home_team_name', 'home_team_code'),
('away_team_name', 'away_team_code')
])
unify_teams_ids(matches, [
('home_team_name', 'home_team_id'),
('away_team_name', 'away_team_id')
])
players_teams = pd.merge(
players[[
'player_id', 'family_name', 'given_name',
'count_tournaments', 'list_tournaments'
]],
squads[['player_id', 'tournament_name', 'team_name', 'team_code']],
on='player_id',
how='inner'
)
players_teams.head()
| player_id | family_name | given_name | count_tournaments | list_tournaments | tournament_name | team_name | team_code | |
|---|---|---|---|---|---|---|---|---|
| 0 | P-08891 | A'Court | Alan | 1 | 1958 | 1958 FIFA World Cup | England | ENG |
| 1 | P-08589 | Aaronson | Brenden | 1 | 2022 | 2022 FIFA World Cup | United States | USA |
| 2 | P-04897 | Abadzhiev | Stefan | 1 | 1966 | 1966 FIFA World Cup | Bulgaria | BGR |
| 3 | P-05556 | Abalo | Jean-Paul | 1 | 2006 | 2006 FIFA World Cup | Togo | TGO |
| 4 | P-08163 | Abanda | Patrice | 1 | 1998 | 1998 FIFA World Cup | Cameroon | CMR |
check_all_na(players_teams)
players_teams.shape
(10973, 8)
players_teams = players_teams.groupby('player_id', as_index=False).agg(
{'player_id': 'first',
'family_name': 'first',
'given_name': 'first',
'count_tournaments': 'first',
'list_tournaments': 'first',
'tournament_name': ', '.join,
'team_name': ', '.join,
'team_code': ', '.join
})
players_teams['teams_num'] = players_teams['team_code'].apply(
lambda x: len(set(x.split(', '))))
check_na(players_teams, 'teams_num')
players_teams.head()
| player_id | family_name | given_name | count_tournaments | list_tournaments | tournament_name | team_name | team_code | teams_num | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | P-00001 | Schumacher | Harald | 2 | 1982, 1986 | 1982 FIFA World Cup, 1986 FIFA World Cup | Germany, Germany | DEU, DEU | 1 |
| 1 | P-00002 | Mohammed Sharif | Ismail | 1 | 1986 | 1986 FIFA World Cup | Iraq | IRQ | 1 |
| 2 | P-00003 | Otávio | not applicable | 1 | 2022 | 2022 FIFA World Cup | Portugal | PRT | 1 |
| 3 | P-00004 | Cortés | Julio César | 3 | 1962, 1966, 1970 | 1962 FIFA World Cup, 1966 FIFA World Cup, 1970... | Uruguay, Uruguay, Uruguay | URY, URY, URY | 1 |
| 4 | P-00005 | Vandersmissen | Guy | 1 | 1982 | 1982 FIFA World Cup | Belgium | BEL | 1 |
players_teams.loc[players_teams['teams_num'] > 1].head()
| player_id | family_name | given_name | count_tournaments | list_tournaments | tournament_name | team_name | team_code | teams_num | |
|---|---|---|---|---|---|---|---|---|---|
| 455 | P-00537 | Wagner | Franz | 2 | 1934, 1938 | 1934 FIFA World Cup, 1938 FIFA World Cup | Austria, Germany | AUT, DEU | 2 |
| 1080 | P-01259 | Puskás | Ferenc | 2 | 1954, 1962 | 1954 FIFA World Cup, 1962 FIFA World Cup | Hungary, Spain | HUN, ESP | 2 |
| 1298 | P-01512 | Altafini | José | 2 | 1958, 1962 | 1958 FIFA World Cup, 1962 FIFA World Cup | Brazil, Italy | BRA, ITA | 2 |
| 1490 | P-01739 | Šuker | Davor | 3 | 1990, 1998, 2002 | 1990 FIFA World Cup, 1998 FIFA World Cup, 2002... | Serbia, Croatia, Croatia | SRB, HRV, HRV | 2 |
| 1502 | P-01757 | Raftl | Rudolf | 2 | 1934, 1938 | 1934 FIFA World Cup, 1938 FIFA World Cup | Austria, Germany | AUT, DEU | 2 |
tournaments = read_data('worldcup-1.1.0/data-csv/tournaments.csv')
tournaments.head()
Index(['key_id', 'tournament_id', 'tournament_name', 'year', 'start_date',
'end_date', 'host_country', 'winner', 'host_won', 'count_teams',
'group_stage', 'second_group_stage', 'final_round', 'round_of_16',
'quarter_finals', 'semi_finals', 'third_place_match', 'final'],
dtype='object')
the shape is: (22, 18)
| key_id | tournament_id | tournament_name | year | start_date | end_date | host_country | winner | host_won | count_teams | group_stage | second_group_stage | final_round | round_of_16 | quarter_finals | semi_finals | third_place_match | final | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | WC-1930 | 1930 FIFA World Cup | 1930 | 1930-07-13 | 1930-07-30 | Uruguay | Uruguay | 1 | 13 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 1 | 2 | WC-1934 | 1934 FIFA World Cup | 1934 | 1934-05-27 | 1934-06-10 | Italy | Italy | 1 | 16 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 |
| 2 | 3 | WC-1938 | 1938 FIFA World Cup | 1938 | 1938-06-04 | 1938-06-19 | France | Italy | 0 | 15 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 |
| 3 | 4 | WC-1950 | 1950 FIFA World Cup | 1950 | 1950-06-24 | 1950-07-16 | Brazil | Uruguay | 0 | 13 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | WC-1954 | 1954 FIFA World Cup | 1954 | 1954-06-16 | 1954-07-04 | Switzerland | West Germany | 0 | 16 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
unify_teams_names(tournaments, ['host_country', 'winner'])
check_all_na(tournaments)
matches = pd.merge(
matches,
tournaments[['tournament_id', 'host_country']],
on='tournament_id',
)
check_all_na(matches)
matches.shape
(964, 43)
goals = read_data('worldcup-1.1.0/data-csv/goals.csv')
goals[goals['player_team_name'] == 'Soviet Union'].head()
Index(['key_id', 'goal_id', 'tournament_id', 'tournament_name', 'match_id',
'match_name', 'match_date', 'stage_name', 'group_name', 'team_id',
'team_name', 'team_code', 'home_team', 'away_team', 'player_id',
'family_name', 'given_name', 'shirt_number', 'player_team_id',
'player_team_name', 'player_team_code', 'minute_label',
'minute_regulation', 'minute_stoppage', 'match_period', 'own_goal',
'penalty'],
dtype='object')
the shape is: (2720, 27)
| key_id | goal_id | tournament_id | tournament_name | match_id | match_name | match_date | stage_name | group_name | team_id | ... | shirt_number | player_team_id | player_team_name | player_team_code | minute_label | minute_regulation | minute_stoppage | match_period | own_goal | penalty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 477 | 478 | G-0478 | WC-1958 | 1958 FIFA World Cup | M-1958-08 | Soviet Union v England | 1958-06-08 | group stage | Group 4 | T-70 | ... | 0 | T-70 | Soviet Union | SUN | 13' | 13 | 0 | first half | 0 | 0 |
| 478 | 479 | G-0479 | WC-1958 | 1958 FIFA World Cup | M-1958-08 | Soviet Union v England | 1958-06-08 | group stage | Group 4 | T-70 | ... | 0 | T-70 | Soviet Union | SUN | 56' | 56 | 0 | second half | 0 | 0 |
| 501 | 502 | G-0502 | WC-1958 | 1958 FIFA World Cup | M-1958-15 | Soviet Union v Austria | 1958-06-11 | group stage | Group 4 | T-70 | ... | 0 | T-70 | Soviet Union | SUN | 15' | 15 | 0 | first half | 0 | 0 |
| 502 | 503 | G-0503 | WC-1958 | 1958 FIFA World Cup | M-1958-15 | Soviet Union v Austria | 1958-06-11 | group stage | Group 4 | T-70 | ... | 0 | T-70 | Soviet Union | SUN | 62' | 62 | 0 | second half | 0 | 0 |
| 542 | 543 | G-0543 | WC-1958 | 1958 FIFA World Cup | M-1958-27 | Soviet Union v England | 1958-06-17 | group stage | Group 4 | T-70 | ... | 0 | T-70 | Soviet Union | SUN | 69' | 69 | 0 | second half | 0 | 0 |
5 rows × 27 columns
unify_teams_names(goals, ['player_team_name', 'team_name'])
unify_teams_codes(goals, [
('player_team_name', 'player_team_code'),
('team_name', 'team_code')
])
unify_teams_ids(goals, [
('player_team_name', 'player_team_id'),
('team_name', 'team_id')
])
unify_matches_names(goals, ['match_name'])
check_all_na(goals)
goals[goals['player_team_name'] == 'Russia'].head()
| key_id | goal_id | tournament_id | tournament_name | match_id | match_name | match_date | stage_name | group_name | team_id | ... | shirt_number | player_team_id | player_team_name | player_team_code | minute_label | minute_regulation | minute_stoppage | match_period | own_goal | penalty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 477 | 478 | G-0478 | WC-1958 | 1958 FIFA World Cup | M-1958-08 | Russia v England | 1958-06-08 | group stage | Group 4 | T-60 | ... | 0 | T-60 | Russia | RUS | 13' | 13 | 0 | first half | 0 | 0 |
| 478 | 479 | G-0479 | WC-1958 | 1958 FIFA World Cup | M-1958-08 | Russia v England | 1958-06-08 | group stage | Group 4 | T-60 | ... | 0 | T-60 | Russia | RUS | 56' | 56 | 0 | second half | 0 | 0 |
| 501 | 502 | G-0502 | WC-1958 | 1958 FIFA World Cup | M-1958-15 | Russia v Austria | 1958-06-11 | group stage | Group 4 | T-60 | ... | 0 | T-60 | Russia | RUS | 15' | 15 | 0 | first half | 0 | 0 |
| 502 | 503 | G-0503 | WC-1958 | 1958 FIFA World Cup | M-1958-15 | Russia v Austria | 1958-06-11 | group stage | Group 4 | T-60 | ... | 0 | T-60 | Russia | RUS | 62' | 62 | 0 | second half | 0 | 0 |
| 542 | 543 | G-0543 | WC-1958 | 1958 FIFA World Cup | M-1958-27 | Russia v England | 1958-06-17 | group stage | Group 4 | T-60 | ... | 0 | T-60 | Russia | RUS | 69' | 69 | 0 | second half | 0 | 0 |
5 rows × 27 columns
matches['total_goals_in_match'] =\
matches['home_team_score'] + \
matches['away_team_score']
matches[['total_goals_in_match', 'home_team_score', 'away_team_score']].head()
| total_goals_in_match | home_team_score | away_team_score | |
|---|---|---|---|
| 0 | 5 | 4 | 1 |
| 1 | 3 | 3 | 0 |
| 2 | 3 | 2 | 1 |
| 3 | 4 | 3 | 1 |
| 4 | 1 | 1 | 0 |
matches['host_for_match'] = (matches['host_country'] == matches['home_team_name']) \
| (matches['host_country'] == matches['away_team_name'])
check_na(matches, 'host_for_match')
matches[
['host_for_match', 'host_country', 'home_team_name', 'away_team_name']
][matches['host_country'] == 'Qatar'].head()
| host_for_match | host_country | home_team_name | away_team_name | |
|---|---|---|---|---|
| 900 | True | Qatar | Qatar | Ecuador |
| 901 | False | Qatar | England | Iran |
| 902 | False | Qatar | Senegal | Netherlands |
| 903 | False | Qatar | United States | Wales |
| 904 | False | Qatar | Argentina | Saudi Arabia |
matches['used_capacity_ratio'] = matches['Attendance'] / matches['stadium_capacity']
matches[['home_team_name', 'away_team_name', 'Date', 'stadium_id',
'used_capacity_ratio', 'Attendance', 'stadium_capacity']].head()
| home_team_name | away_team_name | Date | stadium_id | used_capacity_ratio | Attendance | stadium_capacity | |
|---|---|---|---|---|---|---|---|
| 0 | France | Mexico | 7/13/1930 | S-193 | 0.44440 | 4444 | 10000 |
| 1 | United States | Belgium | 7/13/1930 | S-192 | 0.91730 | 18346 | 20000 |
| 2 | Serbia | Brazil | 7/14/1930 | S-192 | 1.20295 | 24059 | 20000 |
| 3 | Romania | Peru | 7/14/1930 | S-193 | 0.25490 | 2549 | 10000 |
| 4 | Argentina | France | 7/15/1930 | S-192 | 1.17045 | 23409 | 20000 |
سيتم تقسيم الأقسام بحيث يكون عدد الأسطر في كل قسم متساوي
لم يتم استخدام المتوسط الحسابي تجنباً للتأثير القيم المتطرفة
matches['Attendance'].quantile([.33, .66])
0.33 35848.33 0.66 51334.08 Name: Attendance, dtype: float64
labels = ['low', 'medium', 'high']
transformer = FunctionTransformer(pd.qcut, kw_args={'q': 3, 'labels': labels})
matches['attendance_category'] = transformer.fit_transform(matches['Attendance'])
matches[['attendance_category', 'Attendance']].tail()
| attendance_category | Attendance | |
|---|---|---|
| 959 | high | 68895 |
| 960 | high | 88966 |
| 961 | high | 68294 |
| 962 | medium | 44137 |
| 963 | high | 88966 |
matches['attendance_category'].value_counts()
low 327 medium 322 high 315 Name: attendance_category, dtype: int64
matches['used_capacity_ratio'].quantile([.33, .66])
0.33 0.813620 0.66 0.975304 Name: used_capacity_ratio, dtype: float64
labels = ['low', 'medium', 'high']
transformer = FunctionTransformer(pd.qcut, kw_args={'q': 3, 'labels': labels})
matches['relative_attendance_category'] = transformer.fit_transform(
matches['used_capacity_ratio']
)
matches[['relative_attendance_category', 'used_capacity_ratio']].tail()
| relative_attendance_category | used_capacity_ratio | |
|---|---|---|
| 959 | high | 0.998478 |
| 960 | high | 0.999618 |
| 961 | high | 0.989768 |
| 962 | medium | 0.959500 |
| 963 | high | 0.999618 |
matches['relative_attendance_category'].value_counts()
low 322 medium 321 high 321 Name: relative_attendance_category, dtype: int64
في البداية تم ملاحظة مشكلة في كأس العالم 2002، حيث تستضيفه دولتان وتم ذكر اسم كوريا وفي الواقع فإن كوريا الجنوبية هي من كانت المستضيفة إلى جانب اليابان
matches_hosts = matches['host_country'].str.replace('Korea, Japan', 'South Korea, Japan')
matches_hosts[matches_hosts == 'South Korea, Japan'].head()
580 South Korea, Japan 581 South Korea, Japan 582 South Korea, Japan 583 South Korea, Japan 584 South Korea, Japan Name: host_country, dtype: object
matches['host_country_code'] = matches_hosts.apply(
lambda x: ', '.join([teams_codes[country] for country in x.split(', ')])
)
check_na(matches, 'host_country_code')
matches[
['host_country', 'host_country_code', 'home_team_name', 'away_team_name']
][matches['host_country'] == 'Korea, Japan'].head()
| host_country | host_country_code | home_team_name | away_team_name | |
|---|---|---|---|---|
| 580 | Korea, Japan | KOR, JPN | France | Senegal |
| 581 | Korea, Japan | KOR, JPN | Republic of Ireland | Cameroon |
| 582 | Korea, Japan | KOR, JPN | Uruguay | Denmark |
| 583 | Korea, Japan | KOR, JPN | Germany | Saudi Arabia |
| 584 | Korea, Japan | KOR, JPN | Argentina | Nigeria |
matches['tournament_year'] = matches['tournament_name'].apply(lambda x: int(x[:4]))
matches[['tournament_year', 'tournament_name']].head()
| tournament_year | tournament_name | |
|---|---|---|
| 0 | 1930 | 1930 FIFA World Cup |
| 1 | 1930 | 1930 FIFA World Cup |
| 2 | 1930 | 1930 FIFA World Cup |
| 3 | 1930 | 1930 FIFA World Cup |
| 4 | 1930 | 1930 FIFA World Cup |
goals['tournament_year'] = goals['tournament_name'].apply(lambda x: int(x[:4]))
goals[['tournament_year', 'tournament_name']].head()
| tournament_year | tournament_name | |
|---|---|---|
| 0 | 1930 | 1930 FIFA World Cup |
| 1 | 1930 | 1930 FIFA World Cup |
| 2 | 1930 | 1930 FIFA World Cup |
| 3 | 1930 | 1930 FIFA World Cup |
| 4 | 1930 | 1930 FIFA World Cup |
given_names = players_teams[['given_name']].value_counts()
display(given_names.head())
given_names.tail()
given_name not applicable 385 Carlos 75 José 61 Luis 58 Mario 49 dtype: int64
given_name Leongino 1 Demy 1 Leonard 1 Denny 1 N'Golo 1 dtype: int64
family_names = players_teams[['family_name']].value_counts()
display(family_names.head())
family_names.tail()
family_name Kim 38 Rodríguez 29 Lee 28 González 25 Silva 18 dtype: int64
family_name Graafland 1 Goyzueta 1 Goycochea 1 Govou 1 Kundé 1 dtype: int64
def generate_full_name(data, given_name_col='given_name', family_name_col='family_name', full_name_col='full_name'):
mask = data[given_name_col] != 'not applicable'
first_name = data[given_name_col].where(mask, '')
data[full_name_col] = first_name + ' ' + data[family_name_col]
check_na(data, full_name_col)
generate_full_name(players_teams)
players_teams[['given_name', 'family_name', 'full_name']].tail()
| given_name | family_name | full_name | |
|---|---|---|---|
| 8480 | Landry | N'Guémo | Landry N'Guémo |
| 8481 | Ramiro | Castillo | Ramiro Castillo |
| 8482 | Jonás | Gutiérrez | Jonás Gutiérrez |
| 8483 | not applicable | Rivaldo | Rivaldo |
| 8484 | Giorgian | De Arrascaeta | Giorgian De Arrascaeta |
generate_full_name(goals)
goals[['given_name', 'family_name', 'full_name']].tail()
| given_name | family_name | full_name | |
|---|---|---|---|
| 2715 | Ángel | Di María | Ángel Di María |
| 2716 | Kylian | Mbappé | Kylian Mbappé |
| 2717 | Kylian | Mbappé | Kylian Mbappé |
| 2718 | Lionel | Messi | Lionel Messi |
| 2719 | Kylian | Mbappé | Kylian Mbappé |
matches['stage_name'].value_counts()
group stage 676 round of 16 97 quarter-finals 70 semi-finals 38 second group stage 36 final 21 third-place match 20 final round 6 Name: stage_name, dtype: int64
يوجد دوري مجموعات والباقي تصنف ضمن دور خروج المغلوب
matches['short_stage_name'] = matches['stage_name'].apply(
lambda x: 'G' if 'group' in x else 'K'
)
matches[['short_stage_name', 'stage_name']][50:55]
| short_stage_name | stage_name | |
|---|---|---|
| 50 | K | semi-finals |
| 51 | K | third-place match |
| 52 | K | final |
| 53 | G | group stage |
| 54 | G | group stage |
tournaments['winner_code'] = tournaments['winner'].map(teams_codes)
check_na(tournaments, 'winner_code')
tournaments[['tournament_name', 'winner', 'winner_code']].tail()
| tournament_name | winner | winner_code | |
|---|---|---|---|
| 17 | 2006 FIFA World Cup | Italy | ITA |
| 18 | 2010 FIFA World Cup | Spain | ESP |
| 19 | 2014 FIFA World Cup | Germany | DEU |
| 20 | 2018 FIFA World Cup | France | FRA |
| 21 | 2022 FIFA World Cup | Argentina | ARG |
set(goals['match_period'].values)
{'extra time, first half',
'extra time, first half, stoppage time',
'extra time, second half',
'extra time, second half, stoppage time',
'first half',
'first half, stoppage time',
'second half',
'second half, stoppage time'}
سيتم اعتبار الهدف المتأخر على أنه الهدف الذي يأتي بأخر 5 دقائق من كل شوط
periods_endings = {
'first half': 45,
'first half, stoppage time': 45,
'second half': 90,
'second half, stoppage time': 90,
'extra time, first half': 105,
'extra time, first half, stoppage time': 105,
'extra time, second half': 120,
'extra time, second half, stoppage time': 120
}
goals['late_goal'] = goals[['match_period', 'minute_regulation']].apply(
lambda x: x['minute_regulation'] >= periods_endings[x['match_period']] - 5,
axis=1
)
goals[goals['minute_regulation'] > 89][['late_goal', 'match_period', 'minute_regulation']][:10]
| late_goal | match_period | minute_regulation | |
|---|---|---|---|
| 72 | False | extra time, first half | 93 |
| 73 | False | extra time, second half | 109 |
| 74 | True | extra time, second half | 116 |
| 98 | True | second half | 90 |
| 139 | False | extra time, first half | 95 |
| 146 | True | extra time, first half | 103 |
| 147 | True | extra time, first half | 105 |
| 160 | False | extra time, first half | 94 |
| 169 | False | extra time, first half | 93 |
| 170 | True | extra time, first half | 104 |
attendance_in_year = matches.groupby(['tournament_year'])['Attendance']
tournament_years = matches['tournament_year'].unique()
fig = px.line(matches, x=tournament_years, y=attendance_in_year.mean())
fig.add_scatter(x=tournament_years, y=attendance_in_year.mean(), name='Mean')
fig.add_scatter(x=tournament_years, y=attendance_in_year.median(), name='Median')
fig.update_layout(width=716, height=350)
fig.show()
matches.loc[matches['tournament_year'] == 1994]['host_country'].unique()
array(['United States'], dtype=object)
fig = go.Figure(
data=go.Histogram(
x=matches['Attendance'], histfunc='count', xbins={'size': 10000}
)
)
fig.update_layout(
xaxis_title='attendance', yaxis_title='count',
width=600, height=400
)
fig.show()
matches.loc[matches['Attendance'] > 130000][
['match_name', 'tournament_year', 'Attendance', 'stage_name', 'host_country',
'used_capacity_ratio', 'stadium_name', 'stadium_capacity']
].sort_values('Attendance', ascending=False)
| match_name | tournament_year | Attendance | stage_name | host_country | used_capacity_ratio | stadium_name | stadium_capacity | |
|---|---|---|---|---|---|---|---|---|
| 74 | Uruguay v Brazil | 1950 | 173850 | final round | Brazil | 0.869250 | Estádio do Maracanã | 200000 |
| 71 | Brazil v Spain | 1950 | 152772 | final round | Brazil | 0.763860 | Estádio do Maracanã | 200000 |
| 63 | Brazil v Yugoslavia | 1950 | 142429 | group stage | Brazil | 0.712145 | Estádio do Maracanã | 200000 |
| 69 | Brazil v Sweden | 1950 | 138886 | final round | Brazil | 0.694430 | Estádio do Maracanã | 200000 |
stadiums[
['stadium_name', 'stadium_capacity']
].sort_values('stadium_capacity', ascending=False).head()
| stadium_name | stadium_capacity | |
|---|---|---|
| 19 | Estádio do Maracanã | 200000 |
| 148 | Camp Nou | 121000 |
| 98 | Estadio Azteca | 115000 |
| 29 | Wembley Stadium | 99000 |
| 186 | Rose Bowl | 94000 |
fig = px.box(matches, y="Attendance", x="tournament_year", width=600, height=400)
fig.show()
في عام 1970 كان التوزع منتظماً، والوسيط أقرب ما يمكن للمتوسط في 1950 و 1960 و 2010 و 2022، نلاحظ الفرق الكبير بين الوسيط و q3
نستنتج من كل ما سبق أنه في الأغلب البطولات توحد قيم متطرفة والتي غالباً ما تمثل نهائيات البطولة
سيتم إجراء الحسابات وفق الدقائق مع الوقت بدل الضائع
goals['minute_regulation_with_stoppage'] = goals['minute_regulation']\
+ goals['minute_stoppage']
goals_by_tournament = goals.groupby('tournament_year')
minutes_meds = goals_by_tournament['minute_regulation_with_stoppage'].median()
fig = px.bar(
x=tournament_years, y=minutes_meds,
labels={'x': 'tournament years', 'y': 'goals minutes median'},
width=600, height=330
)
fig.show()
print(minutes_meds.count())
(minutes_meds <= 60).sum()
22
21
هناك نسختي كأس عالم 1942 و 1946 تم إلغاؤهما بسبب الحرب العالمية
متوسط دقائق الأهداف قريب من متوسط دقائق المباراة أي 45 دقيقة وفقط قيمة واحدة تتجاوز الـ 60 دقيقة
fig = px.histogram(
matches, x="total_goals_in_match",
color='total_goals_in_match',
histfunc='count',
width=600, height=400
)
fig.show()
معظم المباريات تنتهي بهدف أو اثنين أو ثلاثة
ملاحظة: عند أخذ الدقائق دون احتساب الوقت بدل الضائع فمن المنطقي أن 45 و 90 سيكونا من القيم الأكثر تكراراً
goals['minute_regulation'].value_counts().head()
90 111 45 50 75 47 73 45 18 44 Name: minute_regulation, dtype: int64
goals_per_tournament = goals[
['tournament_year', 'minute_regulation_with_stoppage', 'match_period']
].groupby('tournament_year', as_index=False).agg(pd.Series.mode)
goals_per_tournament
| tournament_year | minute_regulation_with_stoppage | match_period | |
|---|---|---|---|
| 0 | 1930 | 65 | second half |
| 1 | 1934 | [18, 29] | [first half, second half] |
| 2 | 1938 | [35, 44, 89] | first half |
| 3 | 1950 | 17 | second half |
| 4 | 1954 | [18, 30, 54, 60, 69, 78, 84, 85] | second half |
| 5 | 1958 | [4, 18, 24, 32, 44, 52, 55] | second half |
| 6 | 1962 | [17, 29, 56, 73, 90] | second half |
| 7 | 1966 | [15, 43, 75, 88] | second half |
| 8 | 1970 | 76 | second half |
| 9 | 1974 | 18 | second half |
| 10 | 1978 | [43, 45] | first half |
| 11 | 1982 | [68, 75, 83] | second half |
| 12 | 1986 | 62 | second half |
| 13 | 1990 | 81 | second half |
| 14 | 1994 | 90 | second half |
| 15 | 1998 | 46 | second half |
| 16 | 2002 | 88 | second half |
| 17 | 2006 | [6, 84] | first half |
| 18 | 2010 | 89 | second half |
| 19 | 2014 | 82 | second half |
| 20 | 2018 | 51 | second half |
| 21 | 2022 | 48 | second half |
النتيجة منطقية، بسبب أن الأشواط الإضافية أقل وقتا ولا يتم دائماً اللجوء لها بالإضافة ووفقا لمخطط وسيط دقيقة الهدف في البطولة فإن جميع القيم أعلى من 45
late_goals_per_tour = goals_by_tournament['late_goal'].sum()
fig = go.Figure(
data=go.Histogram(
x=late_goals_per_tour, histfunc='count', xbins={'size': 1}
)
)
fig.update_layout(
xaxis_title='late goals per tournament', yaxis_title='count',
width=600, height=400
)
fig.show()
لا يوجد تكرارات كثيرة لعدد الأهداف المتأخرة في البطولات بالإضافة إلى امتداد القيم على مجال واسع نسبياً من 8 إلى 42
goals_scorers = goals[[
'player_id', 'full_name', 'goal_id'
]].groupby('player_id', as_index=False).agg({
'full_name': 'first',
'goal_id': 'count'
}).rename(columns={
'goal_id': 'goals_counter'
}).sort_values(by='goals_counter', ascending=False)
goals_scorers.head()
| player_id | full_name | goals_counter | |
|---|---|---|---|
| 775 | P-05224 | Miroslav Klose | 16 |
| 1246 | P-08490 | Ronaldo | 15 |
| 325 | P-02173 | Gerd Müller | 14 |
| 377 | P-02537 | Just Fontaine | 13 |
| 517 | P-03429 | Lionel Messi | 13 |
fig = px.bar(
goals_scorers[:12], x='full_name', y='goals_counter',
width=600, height=400
)
fig.show()
نستنتج شدة التنافس بين اللاعبين على لقب الهداف التاريخي لكأس العالم (وعدم وجود قيم متطرفة)، وأن كليان مبابي سيكون لديه فرصة كبيرة لكسر الرقم القياسي لكلوزة كونه لا يزال بعمر 24 ولديه فرصة للعب كأسين عالم على الأقل
ملاحظة: تم الأخذ بعين الاعتبار أن يكون للبطولة أكثر من هداف كما حدث في 2010
scorers_per_tournament = goals[[
'player_id', 'full_name', 'tournament_year', 'goal_id'
]].groupby(['player_id', 'tournament_year'], as_index=False).agg({
'full_name': 'first',
'goal_id': 'count'
}).rename(columns={
'goal_id': 'goals_counter'
}).sort_values(by='goals_counter', ascending=False)
scorers_per_tournament[scorers_per_tournament['tournament_year'] == 2022].head()
| player_id | tournament_year | full_name | goals_counter | |
|---|---|---|---|---|
| 1237 | P-06978 | 2022 | Kylian Mbappé | 8 |
| 605 | P-03429 | 2022 | Lionel Messi | 7 |
| 995 | P-05731 | 2022 | Julián Álvarez | 4 |
| 1293 | P-07287 | 2022 | Olivier Giroud | 4 |
| 244 | P-01503 | 2022 | Enner Valencia | 3 |
# finding who achieve goals equal to the max value in each tournament
top_scorers_per_tournament_idx = scorers_per_tournament.groupby(
'tournament_year'
)['goals_counter'].transform(max) == scorers_per_tournament['goals_counter']
# grouping by tournament_year
top_scorers_per_tournament = scorers_per_tournament[
top_scorers_per_tournament_idx
].groupby('tournament_year', as_index=False).agg(
{
'full_name': ', '.join,
'goals_counter': 'first'
}
).sort_values(by='tournament_year', ascending=False)
top_scorers_per_tournament.head()
| tournament_year | full_name | goals_counter | |
|---|---|---|---|
| 21 | 2022 | Kylian Mbappé | 8 |
| 20 | 2018 | Harry Kane | 6 |
| 19 | 2014 | James Rodríguez | 6 |
| 18 | 2010 | Thomas Müller, Wesley Sneijder, David Villa, D... | 5 |
| 17 | 2006 | Miroslav Klose | 5 |
fig = px.bar(
top_scorers_per_tournament,
x='tournament_year', y='goals_counter', hover_data='full_name',
width=600, height=400
)
fig.show()
نلاحظ وجود قيمة متطرفة كبيرة في عام 1958 من قبل اللاعب just fontaine
وبالمقارنة مع مخطط الهداف التاريخي لكأس العالم فإن هذا اللاعب لم يسجل سوى في هذه النسخة من البطولة
معظم قيم الأهداف في كل نسخة تتراوح بين 6 و 8
goals_per_tournament = goals[[
'tournament_year', 'goal_id'
]].groupby('tournament_year', as_index=False).agg({
'goal_id': 'count'
}).rename(columns={
'goal_id': 'goals_counter'
})
goals_per_tournament.head()
| tournament_year | goals_counter | |
|---|---|---|
| 0 | 1930 | 70 |
| 1 | 1934 | 70 |
| 2 | 1938 | 84 |
| 3 | 1950 | 88 |
| 4 | 1954 | 140 |
fig = px.bar(
goals_per_tournament,
x='tournament_year', y='goals_counter',
width=600, height=400
)
fig.show()
نلاحظ ارتفاع عددالأهداف المسجلة في كل بطولة مع مرور الزمن السبب الرئيسي لهذه الظاهرة هو ازدياد عدد المباريات كما سنرى في الطلب الرابع
goals_stages = goals[[
'player_team_name', 'minute_regulation_with_stoppage', 'match_id']
].merge(
matches[['match_id', 'stage_name']], on='match_id'
)
BGI_info = goals_stages[
goals_stages['player_team_name'].isin(['Brazil', 'Germany', 'Italy'])
]
BGI_info[28:33]
| player_team_name | minute_regulation_with_stoppage | match_id | stage_name | |
|---|---|---|---|---|
| 139 | Italy | 95 | M-1934-17 | final |
| 140 | Germany | 29 | M-1938-01 | round of 16 |
| 158 | Italy | 2 | M-1938-05 | round of 16 |
| 160 | Italy | 94 | M-1938-05 | round of 16 |
| 161 | Brazil | 18 | M-1938-06 | round of 16 |
fig = px.strip(
BGI_info, y='minute_regulation_with_stoppage',
x='stage_name', color='player_team_name'
)
fig.show()
وهو هدف ماريو غوتزه الذي جلب اللقب عام 2014
تم أخذ الترتيب الأبجدي للفريقين لتجنب التكرار ما بين ذهاب وإياب
matches['match_name'] = matches[['home_team', 'away_team']].apply(
lambda x: ' v '.join(sorted(list(x))), axis=1
)
check_na(matches, 'match_name')
matches[['home_team', 'away_team', 'match_name']][
(matches['home_team'] == 'Brazil') | (matches['away_team'] == 'Brazil')
].head()
| home_team | away_team | match_name | |
|---|---|---|---|
| 2 | Serbia | Brazil | Brazil v Serbia |
| 11 | Brazil | Bolivia | Bolivia v Brazil |
| 23 | Spain | Brazil | Brazil v Spain |
| 40 | Brazil | Poland | Brazil v Poland |
| 44 | Brazil | Czech Republic | Brazil v Czech Republic |
matches_frequency = matches[
['match_name', 'match_id']
].groupby('match_name', as_index=False).count(
).sort_values(
by='match_id', ascending=False
).rename(columns={'match_id': 'matches_frequency'})
matches_frequency.head()
| match_name | matches_frequency | |
|---|---|---|
| 389 | Germany v Serbia | 7 |
| 28 | Argentina v Germany | 7 |
| 155 | Brazil v Sweden | 7 |
| 39 | Argentina v Netherlands | 6 |
| 152 | Brazil v Serbia | 6 |
fig = px.bar(
matches_frequency[:10],
x='match_name', y='matches_frequency',
width=600, height=400
)
fig.show()
players_for_more_than_2_team = players_teams.loc[players_teams['teams_num'] > 1]
players_for_more_than_2_team.shape
(18, 10)
players_for_more_than_2_team.sort_values(
by='player_id'
)[
['player_id', 'full_name', 'team_name', 'teams_num', 'list_tournaments']
]
| player_id | full_name | team_name | teams_num | list_tournaments | |
|---|---|---|---|---|---|
| 455 | P-00537 | Franz Wagner | Austria, Germany | 2 | 1934, 1938 |
| 1080 | P-01259 | Ferenc Puskás | Hungary, Spain | 2 | 1954, 1962 |
| 1298 | P-01512 | José Altafini | Brazil, Italy | 2 | 1958, 1962 |
| 1490 | P-01739 | Davor Šuker | Serbia, Croatia, Croatia | 2 | 1990, 1998, 2002 |
| 1502 | P-01757 | Rudolf Raftl | Austria, Germany | 2 | 1934, 1938 |
| 2016 | P-02369 | Dejan Stanković | Serbia, Serbia and Montenegro, Serbia | 2 | 1998, 2006, 2010 |
| 2129 | P-02502 | Nikola Žigić | Serbia and Montenegro, Serbia | 2 | 2006, 2010 |
| 2163 | P-02543 | Luis Monti | Argentina, Italy | 2 | 1930, 1934 |
| 2191 | P-02576 | Robert Prosinečki | Serbia, Croatia, Croatia | 2 | 1990, 1998, 2002 |
| 2416 | P-02845 | Attilio Demaría | Argentina, Italy | 2 | 1930, 1934 |
| 2725 | P-03207 | Alen Bokšić | Serbia, Croatia | 2 | 1990, 2002 |
| 3900 | P-04605 | Robert Jarni | Serbia, Croatia, Croatia | 2 | 1990, 1998, 2002 |
| 4580 | P-05394 | José Santamaría | Uruguay, Spain | 2 | 1954, 1962 |
| 5355 | P-06299 | Nemanja Vidić | Serbia and Montenegro, Serbia | 2 | 2006, 2010 |
| 7446 | P-08793 | Vladimir Stojković | Serbia and Montenegro, Serbia, Serbia | 2 | 2006, 2010, 2018 |
| 7597 | P-08975 | Josef Stroh | Austria, Germany | 2 | 1934, 1938 |
| 7867 | P-09293 | Savo Milošević | Serbia, Serbia and Montenegro | 2 | 1998, 2006 |
| 8170 | P-09633 | Willibald Schmaus | Austria, Germany | 2 | 1934, 1938 |
أيضاً هناك بعض اللاعبين الذين يملكون عدة جنسيات مثل Luis Monti وهو اللاعب الوحيد الذي فاز بكأس العالم مع بلدين مختلفين وبالتالي كان في الماضي يحق للاعب تمثيل أي منتخب يحمل جنسيته ولكن في 2004 قامت الفيفا بإصدار قوانين للحد من عمليات التجنيس لأجل تمثيل منتخب أخر
def cramers_v(df, col1, col2):
cross_tabs = pd.crosstab(df[col1], df[col2])
# getting the chi sq. stat
chi2 = chi2_contingency(cross_tabs)[0]
# calculating the total number of observations
n = cross_tabs.sum().sum()
# getting the degrees of freedom
dof = min(cross_tabs.shape)-1
# calculating cramer's v
v = np.sqrt(chi2/(n*dof))
# printing results
print(f'V = {v}')
print(f'Cramer\'s V Degrees of Freedom = {dof}')
def chi_sq_test(df, col1, col2):
cross_tabs = pd.crosstab(df[col1], df[col2])
chi2, p, dof, con_table = chi2_contingency(cross_tabs)
print(f'chi-squared = {chi2}\np value= {p}\ndegrees of freedom = {dof}')
chi_sq_test(tournaments, 'host_country', 'winner')
chi-squared = 120.63333333333335 p value= 0.2719475008128004 degrees of freedom = 112
print("the number of tournaments where the host won: ",
tournaments.loc[tournaments['host_won'] == 1].shape[0])
print("the number of all tournamets: ", tournaments.shape[0])
the number of tournaments where the host won: 6 the number of all tournamets: 22
chi_sq_test(matches, 'host_for_match', 'relative_attendance_category')
chi-squared = 6.499723567042024 p value= 0.038779567436588364 degrees of freedom = 2
cramers_v(matches, 'host_for_match', 'relative_attendance_category')
# cramers_v(matches,'host_for_match','used_capacity_ratio')
V = 0.08211243409515483 Cramer's V Degrees of Freedom = 1
chi_sq_test(matches, 'host_country', 'attendance_category')
chi-squared = 366.43607721193973 p value= 1.9692874526706663e-58 degrees of freedom = 32
cramers_v(matches, 'host_country', 'attendance_category')
V = 0.43595894992853435 Cramer's V Degrees of Freedom = 2
عددالمباريات في كل بطولة
matches_per_tournament = matches[
['tournament_year', 'match_id']
].groupby(
'tournament_year', as_index=False).count(
).rename(columns={'match_id': 'matches_counter'})
fig = px.bar(
matches_per_tournament,
x='tournament_year', y='matches_counter',
width=600, height=400
)
fig.show()
النتيجة: ازدياد عدد المباريات عبر الزمن
أكثر 12 لاعب لعباً للمباريات عبر تاريخ كأس العالم
player_appearances = read_data('worldcup-1.1.0/data-csv/player_appearances.csv')
generate_full_name(player_appearances)
check_all_na(player_appearances)
player_appearances[['match_id', 'full_name']].tail()
Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
'match_date', 'stage_name', 'group_name', 'team_id', 'team_name',
'team_code', 'home_team', 'away_team', 'player_id', 'family_name',
'given_name', 'shirt_number', 'position_name', 'position_code',
'starter', 'substitute', 'captain'],
dtype='object')
the shape is: (20618, 22)
| match_id | full_name | |
|---|---|---|
| 20613 | M-2022-64 | Youssouf Fofana |
| 20614 | M-2022-64 | Kingsley Coman |
| 20615 | M-2022-64 | Ibrahima Konaté |
| 20616 | M-2022-64 | Eduardo Camavinga |
| 20617 | M-2022-64 | Marcus Thuram |
appearances = player_appearances[
['player_id', 'match_id', 'full_name']
].groupby('player_id', as_index=False).agg({
'full_name': 'first',
'match_id': 'count'
}).rename(columns={
'match_id': 'appearances_counter'
}).sort_values(by='appearances_counter', ascending=False)
appearances.head()
| player_id | full_name | appearances_counter | |
|---|---|---|---|
| 1774 | P-03429 | Lionel Messi | 26 |
| 4829 | P-09502 | Lothar Matthäus | 25 |
| 2680 | P-05224 | Miroslav Klose | 24 |
| 2644 | P-05132 | Paolo Maldini | 23 |
| 1622 | P-03142 | Cristiano Ronaldo | 22 |
fig = px.bar(
appearances[:10], x='full_name', y='appearances_counter',
width=600, height=400
)
fig.show()
إحصائية مهمة هي المعدل التهديفي للاعب، إذ لا يتم فقط الاهتمام بعدد الأهداف لكل لاعب بل يتم ربطه بعدد المباريات التي لعبها اللاعب
scorers_with_appearances = appearances.merge(
goals_scorers[['player_id', 'goals_counter']], on='player_id'
)
scorers_with_appearances['goals_ratio'] = scorers_with_appearances['goals_counter'] / \
scorers_with_appearances['appearances_counter']
scorers_with_appearances.sort_values('goals_ratio', ascending=False, inplace=True)
scorers_with_appearances.head()
| player_id | full_name | appearances_counter | goals_counter | goals_ratio | |
|---|---|---|---|---|---|
| 1089 | P-08217 | Helmut Haller | 1 | 6 | 6.000000 |
| 997 | P-04183 | Oleg Salenko | 3 | 6 | 2.000000 |
| 420 | P-09516 | Pelé | 6 | 12 | 2.000000 |
| 969 | P-04680 | Geoff Hurst | 3 | 5 | 1.666667 |
| 432 | P-02871 | Uwe Seeler | 6 | 9 | 1.500000 |
سنقوم بعرض المعدلات التهديفية للاعبين الذي يملكون على الأقل 10 أهداف وذلك للمقارنة بين الهدافين التاريخيين
scorers12_with_appearances = scorers_with_appearances[
scorers_with_appearances['goals_counter'] >= 10
]
fig = px.bar(
scorers12_with_appearances[:12],
x='full_name', y='goals_ratio', hover_data=['goals_counter', 'appearances_counter'],
width=600, height=400
)
fig.show()
نلاحظ المعدل التهديفي الرهيب لبيليه والذي يشكل قيمة متطرفة، وعلى سبيل المثال رغم أن ميسي من الهدافين التاريخين ويتفوق على عدة لاعبين مثل مبابي من ناحية عدد الأهداف، إلا أن معدله التهديفي أقل
المنتخبات صاحبة أعلى أهداف
teams_goals = goals[[
'team_id', 'team_name', 'goal_id'
]].groupby('team_id', as_index=False).agg({
'team_name': 'first',
'goal_id': 'count'
}).rename(columns={
'goal_id': 'goals_counter'
}).sort_values('goals_counter', ascending=False)
teams_goals.head()
| team_id | team_name | goals_counter | |
|---|---|---|---|
| 8 | T-09 | Brazil | 237 |
| 25 | T-29 | Germany | 232 |
| 2 | T-03 | Argentina | 152 |
| 24 | T-28 | France | 136 |
| 35 | T-39 | Italy | 128 |
المنتخبات صاحبة أعلى عدد من المباريات في كأس العالم
team_home_matches = matches[
['home_team_id', 'match_id']
].groupby(
'home_team_id', as_index=False
).count().rename(columns={'home_team_id': 'team_id'})
team_away_matches = matches[
['away_team_id', 'match_id']
].groupby(
'away_team_id', as_index=False
).count().rename(columns={'away_team_id': 'team_id'})
team_home_matches.head()
| team_id | match_id | |
|---|---|---|
| 0 | T-01 | 6 |
| 1 | T-02 | 1 |
| 2 | T-03 | 63 |
| 3 | T-04 | 8 |
| 4 | T-05 | 14 |
team_matches = team_home_matches.merge(team_away_matches, on='team_id')
team_matches['appearances_counter'] = team_matches['match_id_x'] + team_matches['match_id_y']
team_matches.drop(
columns=['match_id_x', 'match_id_y'], inplace=True
)
team_matches.head()
| team_id | appearances_counter | |
|---|---|---|
| 0 | T-01 | 13 |
| 1 | T-02 | 3 |
| 2 | T-03 | 88 |
| 3 | T-04 | 20 |
| 4 | T-05 | 29 |
teams_goals_matches = team_matches.merge(
teams_goals, on='team_id'
).sort_values('appearances_counter', ascending=False)
teams_goals_matches.head()
| team_id | appearances_counter | team_name | goals_counter | |
|---|---|---|---|---|
| 8 | T-09 | 114 | Brazil | 237 |
| 24 | T-29 | 112 | Germany | 232 |
| 2 | T-03 | 88 | Argentina | 152 |
| 33 | T-39 | 83 | Italy | 128 |
| 22 | T-27 | 74 | England | 104 |
fig = px.bar(
teams_goals_matches[:12],
x='team_name', y='appearances_counter', hover_data='goals_counter',
width=600, height=400
)
fig.show()
المعدل التهديفي للفرق
teams_goals_matches['goals_ratio'] = teams_goals_matches['goals_counter'] / \
teams_goals_matches['appearances_counter']
teams_goals_ratio = teams_goals_matches.sort_values('goals_ratio', ascending=False)
teams_goals_ratio.head()
| team_id | appearances_counter | team_name | goals_counter | goals_ratio | |
|---|---|---|---|---|---|
| 29 | T-34 | 32 | Hungary | 87 | 2.718750 |
| 8 | T-09 | 114 | Brazil | 237 | 2.078947 |
| 24 | T-29 | 112 | Germany | 232 | 2.071429 |
| 68 | T-77 | 10 | Turkey | 20 | 2.000000 |
| 23 | T-28 | 73 | France | 136 | 1.863014 |
fig = px.bar(
teams_goals_matches[:12],
x='team_name', y='goals_ratio', hover_data=['goals_counter', 'appearances_counter'],
width=600, height=400
)
fig.show()
المنتخبات التي حققت كأس العالم
tournaments_winners = tournaments[
['winner', 'tournament_id']
].groupby(
'winner', as_index=False
).count().rename(columns={
'tournament_id': 'winning_times'
}).sort_values('winning_times', ascending=False)
tournaments_winners
| winner | winning_times | |
|---|---|---|
| 1 | Brazil | 5 |
| 4 | Germany | 4 |
| 5 | Italy | 4 |
| 0 | Argentina | 3 |
| 3 | France | 2 |
| 7 | Uruguay | 2 |
| 2 | England | 1 |
| 6 | Spain | 1 |
دراسة حالة الارتباط لعدد مرات بالبطولة مع كل من عدد المباريات والأهداف
teams_goals_with_wins = teams_goals_matches.merge(
tournaments_winners.rename(columns={'winner': 'team_name'}),
on='team_name', how='outer'
)
teams_goals_with_wins['winning_times'].fillna(0, inplace=True)
teams_goals_with_wins[:10]
| team_id | appearances_counter | team_name | goals_counter | goals_ratio | winning_times | |
|---|---|---|---|---|---|---|
| 0 | T-09 | 114 | Brazil | 237 | 2.078947 | 5.0 |
| 1 | T-29 | 112 | Germany | 232 | 2.071429 | 4.0 |
| 2 | T-03 | 88 | Argentina | 152 | 1.727273 | 3.0 |
| 3 | T-39 | 83 | Italy | 128 | 1.542169 | 4.0 |
| 4 | T-27 | 74 | England | 104 | 1.405405 | 1.0 |
| 5 | T-28 | 73 | France | 136 | 1.863014 | 2.0 |
| 6 | T-71 | 67 | Spain | 108 | 1.611940 | 1.0 |
| 7 | T-44 | 60 | Mexico | 62 | 1.033333 | 0.0 |
| 8 | T-81 | 59 | Uruguay | 89 | 1.508475 | 2.0 |
| 9 | T-46 | 55 | Netherlands | 96 | 1.745455 | 0.0 |
teams_goals_with_wins.corr()
C:\Users\Sedra\AppData\Local\Temp\ipykernel_18428\1749428966.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
| appearances_counter | goals_counter | goals_ratio | winning_times | |
|---|---|---|---|---|
| appearances_counter | 1.000000 | 0.969362 | 0.647729 | 0.797015 |
| goals_counter | 0.969362 | 1.000000 | 0.708467 | 0.849483 |
| goals_ratio | 0.647729 | 0.708467 | 1.000000 | 0.439483 |
| winning_times | 0.797015 | 0.849483 | 0.439483 | 1.000000 |
بالتأكيد، فإن هناك ارتباط قوي بين الثلاث سمات مع بعضهم البعض، عدد المباريات وعدد الأهداف وعدد مرات التتويج